Description: here we only use the raw mean for analysis, because it has a higher heritibility rate I am thinking about doing TBBC3 2018 and 2019 sepetately. And also combined.
# remove all the variables
rm(list = ls())
# read in dataset
est_tbbc3_18 <- read.csv("/Users/qiuyuting/Documents/data_file/est_tbbc3_18.csv", header = TRUE)
est_tbbc3_19 <- read.csv("/Users/qiuyuting/Documents/data_file/est_tbbc3_19.csv", header = TRUE)
est_z025_19 <- read.csv("/Users/qiuyuting/Documents/data_file/est_z025_19.csv", header = TRUE)
est_tbbc3_1819 <- read.csv("/Users/qiuyuting/Documents/data_file/est_tbbc3_1819.csv", header = TRUE)
est_tbbc3_1819 <- est_tbbc3_1819[,-1]
library(corrplot)
corrplot 0.84 loaded
colnames(est_tbbc3_18)[1] <- "genotype"
round(cor(est_tbbc3_18[,-1]),2)
inci_rate mean mummi_rate mummi_rate1 new_mean1
inci_rate 1.00 0.87 0.70 0.59 0.61
mean 0.87 1.00 0.90 0.82 0.88
mummi_rate 0.70 0.90 1.00 0.96 0.82
mummi_rate1 0.59 0.82 0.96 1.00 0.83
new_mean1 0.61 0.88 0.82 0.83 1.00
colnames(est_tbbc3_19)[1] <- "genotype"
round(cor(est_tbbc3_19[,-1]),2)
inci_rate mean mummi_rate mummi_rate1 new_mean1 severity severity_1
inci_rate 1.00 0.86 -0.07 -0.21 0.53 0.01 0.18
mean 0.86 1.00 0.00 -0.14 0.84 0.26 0.37
mummi_rate -0.07 0.00 1.00 0.88 -0.01 0.09 -0.04
mummi_rate1 -0.21 -0.14 0.88 1.00 -0.08 0.04 -0.11
new_mean1 0.53 0.84 -0.01 -0.08 1.00 0.29 0.39
severity 0.01 0.26 0.09 0.04 0.29 1.00 0.89
severity_1 0.18 0.37 -0.04 -0.11 0.39 0.89 1.00
colnames(est_z025_19)[1] <- "genotype"
round(cor(est_z025_19[,-1]),2)
inci_rate mean mummi_rate mummi_rate1 new_mean1 severity severity_1
inci_rate 1.00 0.91 0.31 0.21 0.69 0.29 0.44
mean 0.91 1.00 0.38 0.29 0.87 0.50 0.60
mummi_rate 0.31 0.38 1.00 0.96 0.33 0.31 0.34
mummi_rate1 0.21 0.29 0.96 1.00 0.30 0.31 0.34
new_mean1 0.69 0.87 0.33 0.30 1.00 0.48 0.61
severity 0.29 0.50 0.31 0.31 0.48 1.00 0.91
severity_1 0.44 0.60 0.34 0.34 0.61 0.91 1.00
colnames(est_tbbc3_1819)[1] <- "genotype"
round(cor(na.omit(est_tbbc3_1819[,-1])),2)
mean newmean1 inci_rate severity severity1
mean 1.00 0.85 0.86 0.27 0.40
newmean1 0.85 1.00 0.56 0.30 0.41
inci_rate 0.86 0.56 1.00 0.02 0.23
severity 0.27 0.30 0.02 1.00 0.90
severity1 0.40 0.41 0.23 0.90 1.00
pairs(est_tbbc3_18[,-1])
pairs(est_tbbc3_19[,-1])
pairs(est_z025_19[,-1])
pairs(est_tbbc3_1819[,-1])
# 2018 all info in, without new variables
PCA_tbbc3_18 <- princomp(est_tbbc3_18[,-c(1,5,6)], cor = T, scores = T) #cor equals T means use the correlation matrix
summary(PCA_tbbc3_18)
Importance of components:
Comp.1 Comp.2 Comp.3
Standard deviation 1.6273363 0.5479225 0.2270626
Proportion of Variance 0.8827412 0.1000730 0.0171858
Cumulative Proportion 0.8827412 0.9828142 1.0000000
# One PC would be enough
loadings(PCA_tbbc3_18) # positively correlated
Loadings:
Comp.1 Comp.2 Comp.3
inci_rate 0.559 0.741 0.372
mean 0.604 -0.795
mummi_rate 0.568 -0.669 0.479
Comp.1 Comp.2 Comp.3
SS loadings 1.000 1.000 1.000
Proportion Var 0.333 0.333 0.333
Cumulative Var 0.333 0.667 1.000
biplot(PCA_tbbc3_18)
tbbc3_18_scores <- PCA_tbbc3_18$scores[,1]
boxplot(est_tbbc3_18$mean)
boxplot(tbbc3_18_scores) # some outliers, but much better than the old one, and there are sone check lines
shapiro.test(est_tbbc3_18$mean)
Shapiro-Wilk normality test
data: est_tbbc3_18$mean
W = 0.94795, p-value = 0.001563
shapiro.test(tbbc3_18_scores) # not quite normal
Shapiro-Wilk normality test
data: tbbc3_18_scores
W = 0.94928, p-value = 0.001876
library(ggpubr)
Loading required package: ggplot2
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Loading required package: magrittr
ggqqplot(tbbc3_18_scores)
summary(PCA_tbbc3_19) # 2PC needed
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4
Standard deviation 1.3777306 1.0522000 0.9426800 0.32571142
Proportion of Variance 0.4745354 0.2767812 0.2221614 0.02652198
Cumulative Proportion 0.4745354 0.7513166 0.9734780 1.00000000
unclass(PCA_tbbc3_19$loadings)
Comp.1 Comp.2 Comp.3 Comp.4
inci_rate 0.6760937 0.20082431 0.20962372 0.67721838
mean 0.7056058 -0.02702519 0.03491572 -0.70722763
mummi_rate -0.0338013 -0.71219681 0.70060309 0.02807995
severity 0.2094638 -0.67209742 -0.68117112 0.20103696
PCA_tbbc3_19$sdev ^2
Comp.1 Comp.2 Comp.3 Comp.4
1.8981415 1.1071249 0.8886457 0.1060879
biplot(PCA_tbbc3_19)
PCA_tbbc3_19.1 <- princomp(est_tbbc3_19[,c(2,3,7)], cor = T, scores = T)
summary(PCA_tbbc3_19.1)
Importance of components:
Comp.1 Comp.2 Comp.3
Standard deviation 1.3773633 0.9980421 0.32677567
Proportion of Variance 0.6323766 0.3320293 0.03559411
Cumulative Proportion 0.6323766 0.9644059 1.00000000
unclass(PCA_tbbc3_19.1$loadings)
Comp.1 Comp.2 Comp.3
inci_rate 0.6749597 0.295373611 0.6761537
mean 0.7063195 0.006391578 -0.7078644
severity 0.2134061 -0.955360444 0.2043141
PCA_tbbc3_19.1$sdev ^2
Comp.1 Comp.2 Comp.3
1.8971297 0.9960880 0.1067823
biplot(PCA_tbbc3_19.1)
PCA_z025_19 <- princomp(est_z025_19[,-c(1,5,6,8)], cor = T, scores = T)
summary(PCA_z025_19) # alomost need 2 pc, but one PC would be fine
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4
Standard deviation 1.5505137 0.9162318 0.8316714 0.25445825
Proportion of Variance 0.6010232 0.2098702 0.1729193 0.01618725
Cumulative Proportion 0.6010232 0.8108934 0.9838127 1.00000000
PCA_z025_19$loadings # mummirate and severity can see some opposite direction
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4
inci_rate 0.560 0.492 0.150 0.650
mean 0.609 0.291 -0.737
mummi_rate 0.378 -0.656 0.653
severity 0.415 -0.494 -0.741 0.187
Comp.1 Comp.2 Comp.3 Comp.4
SS loadings 1.00 1.00 1.00 1.00
Proportion Var 0.25 0.25 0.25 0.25
Cumulative Var 0.25 0.50 0.75 1.00
PCA_z025_19$sdev ^ 2
Comp.1 Comp.2 Comp.3 Comp.4
2.4040929 0.8394808 0.6916773 0.0647490
biplot(PCA_z025_19)
z025_19_scores <- PCA_z025_19$scores[,1]
boxplot(z025_19_scores) # looks nice
shapiro.test(z025_19_scores) # much better
Shapiro-Wilk normality test
data: z025_19_scores
W = 0.98331, p-value = 0.02263
ggqqplot(z025_19_scores) # nice
PCA_z025_19.1 <- princomp(est_z025_19[,-c(1,4,5,6,8)], cor = T, scores = T)
summary(PCA_z025_19.1) # need 1 PC here
Importance of components:
Comp.1 Comp.2 Comp.3
Standard deviation 1.473153 0.8745145 0.25504005
Proportion of Variance 0.723393 0.2549252 0.02168181
Cumulative Proportion 0.723393 0.9783182 1.00000000
unclass(PCA_z025_19.1$loadings)
Comp.1 Comp.2 Comp.3
inci_rate 0.6156361 0.4426308 0.6519741
mean 0.6597818 0.1628836 -0.7335918
severity 0.4309062 -0.8817862 0.1917627
PCA_z025_19.1$sdev ^ 2
Comp.1 Comp.2 Comp.3
2.17017896 0.76477561 0.06504543
biplot(PCA_z025_19.1)
# severity show sometthing opposite
tbbc3_19_scores.1 <- PCA_z025_19.1$scores[,1]
boxplot(tbbc3_19_scores.1)
shapiro.test(tbbc3_19_scores.1) #almost normal
Shapiro-Wilk normality test
data: tbbc3_19_scores.1
W = 0.98585, p-value = 0.05222
ggqqplot(tbbc3_19_scores.1)
# get PCA scores
tbbc3_18_scores <- as.data.frame(PCA_tbbc3_18$scores)
tbbc3_19_scores <- as.data.frame(PCA_tbbc3_19$scores)
tbbc3_19_scores.1 <- as.data.frame(PCA_tbbc3_19.1$scores)
tbbc3_1819_scores <- as.data.frame(PCA_tbbc3_1819$scores)
z025_scores <- as.data.frame(PCA_z025_19$scores)
z025_scores.1 <- as.data.frame(PCA_z025_19.1$scores)
# make a better dataframe
tbbc3_18_scores$PCA <- "tbbc3_18_scores"
tbbc3_18_scores <- cbind(tbbc3_18_scores, est_tbbc3_18[,1])
tbbc3_19_scores$PCA <- "tbbc3_19_scores"
tbbc3_19_scores <- cbind(tbbc3_19_scores, est_tbbc3_19[,1])
tbbc3_1819_scores$PCA <- "tbbc3_1819_scores"
tbbc3_1819_scores <- cbind(tbbc3_1819_scores, est_tbbc3_1819[,1])
Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 93, 94
write.csv(tbbc3_18_scores, "/Users/qiuyuting/Documents/data_file/tbbc3_18_scores.csv")
write.csv(tbbc3_19_scores, "/Users/qiuyuting/Documents/data_file/tbbc3_19_scores.csv")
write.csv(tbbc3_1819_scores, "/Users/qiuyuting/Documents/data_file/tbbc3_1819_scores.csv")
write.csv(tbbc3_19_scores.1, "/Users/qiuyuting/Documents/data_file/tbbc3_19_scores1.csv")
write.csv(z025_scores, "/Users/qiuyuting/Documents/data_file/z025_scores.csv")
write.csv(z025_scores.1, "/Users/qiuyuting/Documents/data_file/z025_scores1.csv")